In [7]:
from IPython.display import display, HTML
from io import StringIO
import requests
import json
import pandas as pd
%matplotlib inline

from pyspark.sql import SQLContext
sqlContext = SQLContext(sc)

In [1]:
# The code was removed by DSX for sharing.


Out[1]:
[Row(=1, CASE_STATUS='CERTIFIED-WITHDRAWN', EMPLOYER_NAME='UNIVERSITY OF MICHIGAN', SOC_NAME='BIOCHEMISTS AND BIOPHYSICISTS', JOB_TITLE='POSTDOCTORAL RESEARCH FELLOW', FULL_TIME_POSITION='N', PREVAILING_WAGE='36067', YEAR='2016', WORKSITE='ANN ARBOR, MICHIGAN', lon='-83.7430378', lat='42.2808256'),
 Row(=2, CASE_STATUS='CERTIFIED-WITHDRAWN', EMPLOYER_NAME='GOODMAN NETWORKS, INC.', SOC_NAME='CHIEF EXECUTIVES', JOB_TITLE='CHIEF OPERATING OFFICER', FULL_TIME_POSITION='Y', PREVAILING_WAGE='242674', YEAR='2016', WORKSITE='PLANO, TEXAS', lon='-96.6988856', lat='33.0198431'),
 Row(=3, CASE_STATUS='CERTIFIED-WITHDRAWN', EMPLOYER_NAME='PORTS AMERICA GROUP, INC.', SOC_NAME='CHIEF EXECUTIVES', JOB_TITLE='CHIEF PROCESS OFFICER', FULL_TIME_POSITION='Y', PREVAILING_WAGE='193066', YEAR='2016', WORKSITE='JERSEY CITY, NEW JERSEY', lon='-74.0776417', lat='40.7281575'),
 Row(=4, CASE_STATUS='CERTIFIED-WITHDRAWN', EMPLOYER_NAME='GATES CORPORATION, A WHOLLY-OWNED SUBSIDIARY OF TOMKINS PLC', SOC_NAME='CHIEF EXECUTIVES', JOB_TITLE='REGIONAL PRESIDEN, AMERICAS', FULL_TIME_POSITION='Y', PREVAILING_WAGE='220314', YEAR='2016', WORKSITE='DENVER, COLORADO', lon='-104.990251', lat='39.7392358'),
 Row(=5, CASE_STATUS='WITHDRAWN', EMPLOYER_NAME='PEABODY INVESTMENTS CORP.', SOC_NAME='CHIEF EXECUTIVES', JOB_TITLE='PRESIDENT MONGOLIA AND INDIA', FULL_TIME_POSITION='Y', PREVAILING_WAGE='157518.4', YEAR='2016', WORKSITE='ST. LOUIS, MISSOURI', lon='-90.1994042', lat='38.6270025')]

In [19]:
df_data_3.head(10)


Out[19]:
[Row(=1, CASE_STATUS='CERTIFIED-WITHDRAWN', EMPLOYER_NAME='UNIVERSITY OF MICHIGAN', SOC_NAME='BIOCHEMISTS AND BIOPHYSICISTS', JOB_TITLE='POSTDOCTORAL RESEARCH FELLOW', FULL_TIME_POSITION='N', PREVAILING_WAGE='36067', YEAR='2016', WORKSITE='ANN ARBOR, MICHIGAN', lon='-83.7430378', lat='42.2808256'),
 Row(=2, CASE_STATUS='CERTIFIED-WITHDRAWN', EMPLOYER_NAME='GOODMAN NETWORKS, INC.', SOC_NAME='CHIEF EXECUTIVES', JOB_TITLE='CHIEF OPERATING OFFICER', FULL_TIME_POSITION='Y', PREVAILING_WAGE='242674', YEAR='2016', WORKSITE='PLANO, TEXAS', lon='-96.6988856', lat='33.0198431'),
 Row(=3, CASE_STATUS='CERTIFIED-WITHDRAWN', EMPLOYER_NAME='PORTS AMERICA GROUP, INC.', SOC_NAME='CHIEF EXECUTIVES', JOB_TITLE='CHIEF PROCESS OFFICER', FULL_TIME_POSITION='Y', PREVAILING_WAGE='193066', YEAR='2016', WORKSITE='JERSEY CITY, NEW JERSEY', lon='-74.0776417', lat='40.7281575'),
 Row(=4, CASE_STATUS='CERTIFIED-WITHDRAWN', EMPLOYER_NAME='GATES CORPORATION, A WHOLLY-OWNED SUBSIDIARY OF TOMKINS PLC', SOC_NAME='CHIEF EXECUTIVES', JOB_TITLE='REGIONAL PRESIDEN, AMERICAS', FULL_TIME_POSITION='Y', PREVAILING_WAGE='220314', YEAR='2016', WORKSITE='DENVER, COLORADO', lon='-104.990251', lat='39.7392358'),
 Row(=5, CASE_STATUS='WITHDRAWN', EMPLOYER_NAME='PEABODY INVESTMENTS CORP.', SOC_NAME='CHIEF EXECUTIVES', JOB_TITLE='PRESIDENT MONGOLIA AND INDIA', FULL_TIME_POSITION='Y', PREVAILING_WAGE='157518.4', YEAR='2016', WORKSITE='ST. LOUIS, MISSOURI', lon='-90.1994042', lat='38.6270025'),
 Row(=6, CASE_STATUS='CERTIFIED-WITHDRAWN', EMPLOYER_NAME='BURGER KING CORPORATION', SOC_NAME='CHIEF EXECUTIVES', JOB_TITLE='EXECUTIVE V P, GLOBAL DEVELOPMENT AND PRESIDENT, LATIN AMERI', FULL_TIME_POSITION='Y', PREVAILING_WAGE='225000', YEAR='2016', WORKSITE='MIAMI, FLORIDA', lon='-80.1917902', lat='25.7616798'),
 Row(=7, CASE_STATUS='CERTIFIED-WITHDRAWN', EMPLOYER_NAME='BT AND MK ENERGY AND COMMODITIES', SOC_NAME='CHIEF EXECUTIVES', JOB_TITLE='CHIEF OPERATING OFFICER', FULL_TIME_POSITION='Y', PREVAILING_WAGE='91021', YEAR='2016', WORKSITE='HOUSTON, TEXAS', lon='-95.3698028', lat='29.7604267'),
 Row(=8, CASE_STATUS='CERTIFIED-WITHDRAWN', EMPLOYER_NAME='GLOBO MOBILE TECHNOLOGIES, INC.', SOC_NAME='CHIEF EXECUTIVES', JOB_TITLE='CHIEF OPERATIONS OFFICER', FULL_TIME_POSITION='Y', PREVAILING_WAGE='150000', YEAR='2016', WORKSITE='SAN JOSE, CALIFORNIA', lon='-121.8863286', lat='37.3382082'),
 Row(=9, CASE_STATUS='CERTIFIED-WITHDRAWN', EMPLOYER_NAME='ESI COMPANIES INC.', SOC_NAME='CHIEF EXECUTIVES', JOB_TITLE='PRESIDENT', FULL_TIME_POSITION='Y', PREVAILING_WAGE='127546', YEAR='2016', WORKSITE='MEMPHIS, TEXAS', lon='NA', lat='NA'),
 Row(=10, CASE_STATUS='WITHDRAWN', EMPLOYER_NAME='LESSARD INTERNATIONAL LLC', SOC_NAME='CHIEF EXECUTIVES', JOB_TITLE='PRESIDENT', FULL_TIME_POSITION='Y', PREVAILING_WAGE='154648', YEAR='2016', WORKSITE='VIENNA, VIRGINIA', lon='-77.2652604', lat='38.9012225')]

In [20]:
(df_data_3[['CASE_STATUS', 'SOC_NAME']]).show()
#use this to make a chart with two indexes


+-------------------+--------------------+
|        CASE_STATUS|            SOC_NAME|
+-------------------+--------------------+
|CERTIFIED-WITHDRAWN|BIOCHEMISTS AND B...|
|CERTIFIED-WITHDRAWN|    CHIEF EXECUTIVES|
|CERTIFIED-WITHDRAWN|    CHIEF EXECUTIVES|
|CERTIFIED-WITHDRAWN|    CHIEF EXECUTIVES|
|          WITHDRAWN|    CHIEF EXECUTIVES|
|CERTIFIED-WITHDRAWN|    CHIEF EXECUTIVES|
|CERTIFIED-WITHDRAWN|    CHIEF EXECUTIVES|
|CERTIFIED-WITHDRAWN|    CHIEF EXECUTIVES|
|CERTIFIED-WITHDRAWN|    CHIEF EXECUTIVES|
|          WITHDRAWN|    CHIEF EXECUTIVES|
|CERTIFIED-WITHDRAWN|    CHIEF EXECUTIVES|
|CERTIFIED-WITHDRAWN|    CHIEF EXECUTIVES|
|CERTIFIED-WITHDRAWN|    CHIEF EXECUTIVES|
|CERTIFIED-WITHDRAWN|    CHIEF EXECUTIVES|
|CERTIFIED-WITHDRAWN|    CHIEF EXECUTIVES|
|CERTIFIED-WITHDRAWN|    CHIEF EXECUTIVES|
|CERTIFIED-WITHDRAWN|  FINANCIAL MANAGERS|
|CERTIFIED-WITHDRAWN|    CHIEF EXECUTIVES|
|          CERTIFIED|    CHIEF EXECUTIVES|
|          CERTIFIED|    CHIEF EXECUTIVES|
+-------------------+--------------------+
only showing top 20 rows


In [24]:
df_data_3.select('SOC_NAME').count()


Out[24]:
3002458

In [5]:
df_data_2['SOC_NAME'].value_counts().count()


Out[5]:
2132

In [6]:
testdata=df_data_2.copy()

In [7]:
testdata['SOC_NAME']=df_data_2['SOC_NAME'].str.lower()
testdata['EMPLOYER_NAME']=df_data_2['EMPLOYER_NAME'].str.lower()
testdata['JOB_TITLE']=df_data_2['JOB_TITLE'].str.lower()

Here, we transform some strings to lowercase. This is because there are duplicate entries in the dataset which in both upper and lower. This increases redundancy


In [8]:
testdata['SOC_NAME'].value_counts()


Out[8]:
computer systems analysts                                       506523
software developers, applications                               414716
computer programmers                                            398546
computer occupations, all other                                 177870
software developers, systems software                            83856
management analysts                                              66873
financial analysts                                               53540
accountants and auditors                                         52822
mechanical engineers                                             44236
network and computer systems administrators                      40087
database administrators                                          38982
market research analysts and marketing specialists               37737
electronics engineers, except computer                           36574
operations research analysts                                     34260
electrical engineers                                             34108
physicians and surgeons, all other                               33526
computer software engineers, applications                        33387
computer and information systems managers                        27536
computer occupations, all other*                                 26254
medical scientists, except epidemiologists                       26159
physical therapists                                              21994
biochemists and biophysicists                                    21245
industrial engineers                                             19370
computer systems analyst                                         17426
statisticians                                                    17101
biological scientists, all other                                 16367
marketing managers                                               16310
civil engineers                                                  15970
web developers                                                   15000
internists, general                                              13367
                                                                 ...  
helpers--electricians                                                1
industrial designers                                                 1
logistics manager                                                    1
electronics engineers, expert computer                               1
hardward engineers                                                   1
soil and plant scientist                                             1
mechanical engineers, r&d (acwia only)                               1
timing device assemblers and adjusters                               1
telecommunications line installers and repairers                     1
biological science teachers                                          1
dentist                                                              1
computer systems engineers/arquitects                                1
secondary school teachers,                                           1
computer systems anaysts                                             1
counseling psychologist                                              1
software engineer, senior                                            1
health diagnosing & treating practitioners, all ot                   1
set and exhibit engineers                                            1
helpers, construction trades, all other                              1
datebase administrators                                              1
commercial and insdistrial designers                                 1
software developers, appllications                                   1
computer occupations,all other                                       1
foreign language and literature teachers, postsecondary              1
dentists                                                             1
railroad conductors and yardmasters                                  1
computer occupations, all other: information techn                   1
computer network architect                                           1
criminal justice and law enforcement teachers, postsecondary         1
medical scientists, except epidemiologist                            1
Name: SOC_NAME, dtype: int64

There is still alot of redundancy we can exploit. We can generalize these strings remove specalized strings into more general form. For example 'software engineer, senior' needs to be reduced to 'software engineer'. This also applies to the other columns with string attributes.


In [9]:
testdata['SOC_NAME'].value_counts().count()


Out[9]:
1585

In [10]:
df = pd.DataFrame({'SOC_NAME': testdata['SOC_NAME'].value_counts().index, 'Count':testdata['SOC_NAME'].value_counts().values})
#df['Counts'] = df.groupby(['SOC_NAME'])['Count'].transform('count') #I don't remember what I was trying to do here.
df = df.set_index(['SOC_NAME'])
df


Out[10]:
Count
SOC_NAME
computer systems analysts 506523
software developers, applications 414716
computer programmers 398546
computer occupations, all other 177870
software developers, systems software 83856
management analysts 66873
financial analysts 53540
accountants and auditors 52822
mechanical engineers 44236
network and computer systems administrators 40087
database administrators 38982
market research analysts and marketing specialists 37737
electronics engineers, except computer 36574
operations research analysts 34260
electrical engineers 34108
physicians and surgeons, all other 33526
computer software engineers, applications 33387
computer and information systems managers 27536
computer occupations, all other* 26254
medical scientists, except epidemiologists 26159
physical therapists 21994
biochemists and biophysicists 21245
industrial engineers 19370
computer systems analyst 17426
statisticians 17101
biological scientists, all other 16367
marketing managers 16310
civil engineers 15970
web developers 15000
internists, general 13367
... ...
helpers--electricians 1
industrial designers 1
logistics manager 1
electronics engineers, expert computer 1
hardward engineers 1
soil and plant scientist 1
mechanical engineers, r&d (acwia only) 1
timing device assemblers and adjusters 1
telecommunications line installers and repairers 1
biological science teachers 1
dentist 1
computer systems engineers/arquitects 1
secondary school teachers, 1
computer systems anaysts 1
counseling psychologist 1
software engineer, senior 1
health diagnosing & treating practitioners, all ot 1
set and exhibit engineers 1
helpers, construction trades, all other 1
datebase administrators 1
commercial and insdistrial designers 1
software developers, appllications 1
computer occupations,all other 1
foreign language and literature teachers, postsecondary 1
dentists 1
railroad conductors and yardmasters 1
computer occupations, all other: information techn 1
computer network architect 1
criminal justice and law enforcement teachers, postsecondary 1
medical scientists, except epidemiologist 1

1585 rows × 1 columns


In [11]:
df['New_Name'] = ''

In [ ]:
df


Out[ ]:
Count New_Name
SOC_NAME
computer systems analysts 506523
software developers, applications 414716
computer programmers 398546
computer occupations, all other 177870
software developers, systems software 83856
management analysts 66873
financial analysts 53540
accountants and auditors 52822
mechanical engineers 44236
network and computer systems administrators 40087
database administrators 38982
market research analysts and marketing specialists 37737
electronics engineers, except computer 36574
operations research analysts 34260
electrical engineers 34108
physicians and surgeons, all other 33526
computer software engineers, applications 33387
computer and information systems managers 27536
computer occupations, all other* 26254
medical scientists, except epidemiologists 26159
physical therapists 21994
biochemists and biophysicists 21245
industrial engineers 19370
computer systems analyst 17426
statisticians 17101
biological scientists, all other 16367
marketing managers 16310
civil engineers 15970
web developers 15000
internists, general 13367
... ... ...
helpers--electricians 1
industrial designers 1
logistics manager 1
electronics engineers, expert computer 1
hardward engineers 1
soil and plant scientist 1
mechanical engineers, r&d (acwia only) 1
timing device assemblers and adjusters 1
telecommunications line installers and repairers 1
biological science teachers 1
dentist 1
computer systems engineers/arquitects 1
secondary school teachers, 1
computer systems anaysts 1
counseling psychologist 1
software engineer, senior 1
health diagnosing & treating practitioners, all ot 1
set and exhibit engineers 1
helpers, construction trades, all other 1
datebase administrators 1
commercial and insdistrial designers 1
software developers, appllications 1
computer occupations,all other 1
foreign language and literature teachers, postsecondary 1
dentists 1
railroad conductors and yardmasters 1
computer occupations, all other: information techn 1
computer network architect 1
criminal justice and law enforcement teachers, postsecondary 1
medical scientists, except epidemiologist 1

1585 rows × 2 columns


In [ ]:
%timeit
for index, row in testdata.iterrows():
    if(str(row['SOC_NAME']).endswith('*')):
        row['SOC_NAME']=str(row['SOC_NAME'])[:-1]
    if not (str(row['SOC_NAME']).endswith('s')):
        row['SOC_NAME']=str(row['SOC_NAME'])+'s'
    #if row=='software developers, appllications': #in progress
        #print(row)

In [ ]:
(testdata.loc[(testdata['SOC_NAME']=='software developers, appllications')]) #an example of a query

This person messed up the SOC_NAME